This guide follows the Bioconductor RNA-Seq workflow to find differentially expressed genes using DESeq2. Load the hciR and pasilla packages to run the R code.

You can render this HTML report by finding the path to the [pasilla_DESeq.Rmd] file in inst/Rmd.

library(hciR)
rmd <- system.file("Rmd", "pasilla_DESeq.Rmd", package="hciR")
render(rmd, output_dir=".")

Load samples and counts

Load the pasilla counts and samples in the extdata directory using the readr package.

library(readr)
extdata <- system.file("extdata", package="pasilla")
counts  <- read_tsv(paste(extdata, "pasilla_gene_counts.tsv", sep="/"))
samples <- read_csv(paste(extdata, "pasilla_sample_annotation.csv", sep="/"))
samples
#  # A tibble: 7 x 6
#            file condition        type `number of lanes` `total number of reads` `exon counts`
#           <chr>     <chr>       <chr>             <int>                   <chr>         <int>
#  1   treated1fb   treated single-read                 5                35158667      15679615
#  2   treated2fb   treated  paired-end                 2           12242535 (x2)      15620018
#  3   treated3fb   treated  paired-end                 2           12443664 (x2)      12733865
#  4 untreated1fb untreated single-read                 2                17812866      14924838
#  5 untreated2fb untreated single-read                 6                34284521      20764558
#  6 untreated3fb untreated  paired-end                 2           10542625 (x2)      10283129
#  7 untreated4fb untreated  paired-end                 2           12214974 (x2)      11653031

Remove 2240 features with zero counts and 1323 with one or fewer reads in any sample.

counts <- filter_counts(counts)
#  Removed 2240 features with 0 reads
#  Removed 1323 features with <=1 maximum reads
counts
#  # A tibble: 11,036 x 8
#         gene_id untreated1 untreated2 untreated3 untreated4 treated1 treated2 treated3
#           <chr>      <int>      <int>      <int>      <int>    <int>    <int>    <int>
#   1 FBgn0000008         92        161         76         70      140       88       70
#   2 FBgn0000014          5          1          0          0        4        0        0
#   3 FBgn0000015          0          2          1          2        1        0        0
#   4 FBgn0000017       4664       8714       3564       3150     6205     3072     3334
#   5 FBgn0000018        583        761        245        310      722      299      308
#   6 FBgn0000024         10         11          3          3       10        7        5
#   7 FBgn0000032       1446       1713        615        672     1698      696      757
#   8 FBgn0000036          2          1          0          0        1        0        1
#   9 FBgn0000037         15         25          9          5       20       14       17
#  10 FBgn0000042     101664     120163      45880      53201   127363    76099    83164
#  # ... with 11,026 more rows

Run DESeq

To run the DESeq2 wrapper for tibbles, the names in the first sample column should match the count column names, so fix the file name (the names should be identical, but the order does not matter).

samples$file <- gsub("fb$", "", samples$file )

Combine the counts and samples to create a DESeqDataSet object and calculate the regularized log transform (rlog) for sample visualizations.

dds <- deseq_from_tibble(counts, samples,  design = ~ condition)
#  Reordering columns in counts to match samples
#  estimating size factors
#  estimating dispersions
#  gene-wise dispersion estimates
#  mean-dispersion relationship
#  final dispersion estimates
#  fitting model and testing
rld <- r_log(dds)

PCA plot

Plot the first two principal components using the rlog values from the top 500 variable genes. You can hover over points to view sample names or zoom into groups of points in this interactive highchart.

plot_pca(rld, "condition", tooltip=c("file", "type") , width=700)

Cluster all the rlog values using the R function dist to calculate the Euclidean distance between samples.

plot_dist(rld , c("condition", "type"), palette="Blues", diagNA=FALSE, reverse_pal=TRUE)

Gene annotations

Load fruitfly annotations from Ensembl.

fly <- read_biomart("fly")
#  Downloaded 19663 results, grouping into 17559 rows with a unique Ensembl ID
fly
#  # A tibble: 17,559 x 10
#              id      gene_name        biotype chromosome    start      end strand
#           <chr>          <chr>          <chr>      <chr>    <int>    <int>  <int>
#   1 FBgn0000003 7SLRNA:CR32864        lincRNA         3R  6822498  6822796      1
#   2 FBgn0000008              a protein_coding         2R 22136968 22172834      1
#   3 FBgn0000014          abd-A protein_coding         3R 16807214 16830049     -1
#   4 FBgn0000015          Abd-B protein_coding         3R 16927210 16972236     -1
#   5 FBgn0000017            Abl protein_coding         3L 16615866 16647882     -1
#   6 FBgn0000018            abo protein_coding         2L 10973443 10975293     -1
#   7 FBgn0000022             ac protein_coding          X   370031   370947      1
#   8 FBgn0000024            Ace protein_coding         3R 13222951 13259517     -1
#   9 FBgn0000028           acj6 protein_coding          X 15350239 15379064     -1
#  10 FBgn0000032         Acph-1 protein_coding         3R 29991144 29993411     -1
#  # ... with 17,549 more rows, and 3 more variables: description <chr>, transcript_count <int>, entrez_gene <chr>

Result tables

Get the annotated DESeq results using a 5% false discovery rate (FDR). The default is to compare all treatment combinations, which only includes one in this study.

Since we downloaded the most recent fly annotations, there are 723 missing (note you could set the version in read_biomart to match the old count table)

res <- results_all(dds, fly, alpha= 0.05)
#  Using adjusted p-value < 0.05
#  1. treated vs. untreated: 408 up and 433 down regulated
#  Note: 723 rows in results are missing from biomart table

Browse results

Create a flex dashboard using the top 2000 genes sorted by p-value in pasilla_flex.html. The MA-plot and volcano plot are linked, so you can click and drag to create a box to highlight points and then view matching rows in the table. You can also drag the box around the plot to easily highlight other points and rows. In addition, you can search for genes in the table using the search box, but need to click on the row in order to highlight the points in the plots. The sliders can also be used to limit the results.

rmd <- system.file("Rmd", "DESeq_flex.Rmd", package="hciR")
render(rmd, output_file="pasilla_flex.html",  output_dir=".",
         params=list( results= res, title = "Pasilla treated vs. untreated", top= 2000 ))

Save results

Save the DESeq results to a single Excel file in DESeq.xlsx. The write_deseq function will also output raw counts, rlog values, normalized counts, samples and fly annotations.

write_deseq(res, dds, rld, fly)

Plots

Plot the fold changes and p-values in an interactive volcano plot.

plot_volcano(res, padj = 1e-10, log2Fold =1 , width=700)
#  Adding mouseover labels to 353 genes (4.2%)
#  Missing 38 gene names, using Ensembl IDs instead


Select the top 40 genes sorted by p-value and cluster the rlog differences, so values in the heatmap represent the amount a gene deviates in a specific sample from the gene’s average across all samples.


x <- top_counts( res, rld, top=40)
x
#  # A tibble: 40 x 8
#         id  treated1  treated2  treated3 untreated1 untreated2 untreated3 untreated4
#      <chr>     <dbl>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#   1   Kal1  7.452024  7.606977  7.661932   9.793234   9.719801   9.627366   9.709257
#   2   Ant2 10.845886 11.038574 11.070029   9.276907   9.211766   9.345823   9.305620
#   3    Hml 10.855296 10.875146 10.775652  12.113658  12.106649  12.180227  12.094599
#   4   sesB 10.522534 10.545875 10.498051  12.515862  12.419081  12.151191  12.248099
#   5 CG3770  8.133921  8.266383  8.270252   9.607688   9.464912   9.572734   9.612732
#   6 CG1544  6.490788  6.594402  6.716758   8.189241   8.106000   8.310342   8.276030
#   7 CG6018  6.512603  6.673197  6.654972   7.902767   8.006466   8.154444   7.987397
#   8 CG3168  7.952302  7.894715  7.926235   9.047183   9.254243   9.258307   9.137957
#   9    Ama  8.714037  8.812205  8.869148   7.239193   7.416696   7.502398   7.531298
#  10   LpR2  7.542688  7.585596  7.597285   6.554402   6.541649   6.609272   6.551323
#  # ... with 30 more rows
plot_genes(x, c("condition", "type") )


Plot the top 400 genes using an interactive d3heatmap. Click and drag over a region in the plot to zoom and better view gene labels.

plot_genes( top_counts( res, rld, top=400), output="d3", xaxis_font_size=12, show_grid=FALSE)